{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from pandas import DataFrame\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 处理丢失数据\n",
    "- 有两种丢失数据：\n",
    "    - None\n",
    "    - np.nan(NaN)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 两种丢失数据的区别"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "NoneType"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "float"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(np.nan)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 为什么在数据分析中需要用到的是浮点类型的空而不是对象类型？\n",
    "    - 数据分析中会常常使用某些形式的运算来处理原始数据，如果原数数据中的空值为NAN的形式，则不会干扰或者中断运算。\n",
    "    - NAN可以参与运算的\n",
    "    - None是不可以参与运算"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "nan"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.nan + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "ename": "TypeError",
     "evalue": "unsupported operand type(s) for +: 'NoneType' and 'int'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[5], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m\n",
      "\u001b[1;31mTypeError\u001b[0m: unsupported operand type(s) for +: 'NoneType' and 'int'"
     ]
    }
   ],
   "source": [
    "# None + 1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 在pandas中如果遇到了None形式的空值则pandas会将其强转成NAN的形式。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5</td>\n",
       "      <td>52</td>\n",
       "      <td>66.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>31.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>81</td>\n",
       "      <td>70</td>\n",
       "      <td>17.0</td>\n",
       "      <td>64.0</td>\n",
       "      <td>49.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>71</td>\n",
       "      <td>59</td>\n",
       "      <td>90.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>33.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>22</td>\n",
       "      <td>98</td>\n",
       "      <td>19.0</td>\n",
       "      <td>50.0</td>\n",
       "      <td>51.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>73</td>\n",
       "      <td>29</td>\n",
       "      <td>NaN</td>\n",
       "      <td>78.0</td>\n",
       "      <td>34.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>93</td>\n",
       "      <td>3</td>\n",
       "      <td>24.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>16</td>\n",
       "      <td>60</td>\n",
       "      <td>95.0</td>\n",
       "      <td>26.0</td>\n",
       "      <td>25.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    0   1     2     3     4\n",
       "0   5  52  66.0   0.0  31.0\n",
       "1  81  70  17.0  64.0  49.0\n",
       "2  71  59  90.0   NaN  33.0\n",
       "3  22  98  19.0  50.0  51.0\n",
       "4  73  29   NaN  78.0  34.0\n",
       "5  93   3  24.0  12.0   NaN\n",
       "6  16  60  95.0  26.0  25.0"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = DataFrame(data=np.random.randint(0,100,size=(7,5)))\n",
    "df.iloc[2,3] = None\n",
    "df.iloc[4,2] = np.nan\n",
    "df.iloc[5,4] = None\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### pandas处理空值操作\n",
    "- isnull\n",
    "- notnull\n",
    "- any\n",
    "- all\n",
    "- dropna\n",
    "- fillna"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    False\n",
       "1    False\n",
       "2    False\n",
       "3     True\n",
       "4    False\n",
       "dtype: bool"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s1 = pd.Series([1,2,3,None,4])\n",
    "s1.isnull()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 方式1：对空值进行过滤（删除空所在的行数据）\n",
    "    - 技术：isnull，notnull，any，all"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       0      1      2      3      4\n",
       "0  False  False  False  False  False\n",
       "1  False  False  False  False  False\n",
       "2  False  False  False   True  False\n",
       "3  False  False  False  False  False\n",
       "4  False  False   True  False  False\n",
       "5  False  False  False  False   True\n",
       "6  False  False  False  False  False"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.isnull()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    False\n",
       "1    False\n",
       "2     True\n",
       "3     True\n",
       "4     True\n",
       "dtype: bool"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#哪些行中有空值\n",
    "#any(axis=1)检测哪些行中存有空值\n",
    "df.isnull().any(axis=1) #any会作用isnull返回结果的每一行\n",
    "#true对应的行就是存有缺失数据的行"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      0     1      2      3      4\n",
       "0  True  True   True   True   True\n",
       "1  True  True   True   True   True\n",
       "2  True  True   True  False   True\n",
       "3  True  True   True   True   True\n",
       "4  True  True  False   True   True\n",
       "5  True  True   True   True  False\n",
       "6  True  True   True   True   True"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.notnull()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     True\n",
       "1     True\n",
       "2    False\n",
       "3     True\n",
       "4    False\n",
       "5    False\n",
       "6     True\n",
       "dtype: bool"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "is_list = df.notnull().all(axis=1)\n",
    "\n",
    "is_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5</td>\n",
       "      <td>52</td>\n",
       "      <td>66.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>31.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>81</td>\n",
       "      <td>70</td>\n",
       "      <td>17.0</td>\n",
       "      <td>64.0</td>\n",
       "      <td>49.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>22</td>\n",
       "      <td>98</td>\n",
       "      <td>19.0</td>\n",
       "      <td>50.0</td>\n",
       "      <td>51.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>16</td>\n",
       "      <td>60</td>\n",
       "      <td>95.0</td>\n",
       "      <td>26.0</td>\n",
       "      <td>25.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    0   1     2     3     4\n",
       "0   5  52  66.0   0.0  31.0\n",
       "1  81  70  17.0  64.0  49.0\n",
       "3  22  98  19.0  50.0  51.0\n",
       "6  16  60  95.0  26.0  25.0"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#只输出行没有空值的数据\n",
    "df.loc[is_list]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5</td>\n",
       "      <td>52</td>\n",
       "      <td>66.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>31.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>81</td>\n",
       "      <td>70</td>\n",
       "      <td>17.0</td>\n",
       "      <td>64.0</td>\n",
       "      <td>49.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    0   1     2     3     4\n",
       "0   5  52  66.0   0.0  31.0\n",
       "1  81  70  17.0  64.0  49.0"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.loc[[True,True,False,False,False,False,False]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>60</td>\n",
       "      <td>20</td>\n",
       "      <td>95.0</td>\n",
       "      <td>19.0</td>\n",
       "      <td>66.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>92</td>\n",
       "      <td>27</td>\n",
       "      <td>66.0</td>\n",
       "      <td>7.0</td>\n",
       "      <td>94.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>61</td>\n",
       "      <td>7</td>\n",
       "      <td>51.0</td>\n",
       "      <td>81.0</td>\n",
       "      <td>87.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>80</td>\n",
       "      <td>48</td>\n",
       "      <td>1.0</td>\n",
       "      <td>19.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    0   1     2     3     4\n",
       "0  60  20  95.0  19.0  66.0\n",
       "1  92  27  66.0   7.0  94.0\n",
       "3  61   7  51.0  81.0  87.0\n",
       "6  80  48   1.0  19.0   0.0"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#将布尔值作为源数据的行索引\n",
    "df.loc[df.notnull().all(axis=1)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>91</td>\n",
       "      <td>10</td>\n",
       "      <td>12.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>29.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>96</td>\n",
       "      <td>67</td>\n",
       "      <td>NaN</td>\n",
       "      <td>96.0</td>\n",
       "      <td>70.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>92</td>\n",
       "      <td>21</td>\n",
       "      <td>61.0</td>\n",
       "      <td>58.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    0   1     2     3     4\n",
       "2  91  10  12.0   NaN  29.0\n",
       "4  96  67   NaN  96.0  70.0\n",
       "5  92  21  61.0  58.0   NaN"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#获取空对应的行数据\n",
    "df.loc[df.isnull().any(axis=1)]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index([2, 4, 5], dtype='int64')"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#获取空对应行数据的行索引\n",
    "indexs = df.loc[df.isnull().any(axis=1)].index\n",
    "indexs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>60</td>\n",
       "      <td>20</td>\n",
       "      <td>95.0</td>\n",
       "      <td>19.0</td>\n",
       "      <td>66.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>92</td>\n",
       "      <td>27</td>\n",
       "      <td>66.0</td>\n",
       "      <td>7.0</td>\n",
       "      <td>94.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>61</td>\n",
       "      <td>7</td>\n",
       "      <td>51.0</td>\n",
       "      <td>81.0</td>\n",
       "      <td>87.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>80</td>\n",
       "      <td>48</td>\n",
       "      <td>1.0</td>\n",
       "      <td>19.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    0   1     2     3     4\n",
       "0  60  20  95.0  19.0  66.0\n",
       "1  92  27  66.0   7.0  94.0\n",
       "3  61   7  51.0  81.0  87.0\n",
       "6  80  48   1.0  19.0   0.0"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#对原本的数据进行删除操作\n",
    "df.drop(labels=indexs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 方式2：\n",
    "    - dropna：可以直接将缺失的行或者列进行删除"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5</td>\n",
       "      <td>52</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>81</td>\n",
       "      <td>70</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>71</td>\n",
       "      <td>59</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>22</td>\n",
       "      <td>98</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>73</td>\n",
       "      <td>29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>93</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>16</td>\n",
       "      <td>60</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    0   1\n",
       "0   5  52\n",
       "1  81  70\n",
       "2  71  59\n",
       "3  22  98\n",
       "4  73  29\n",
       "5  93   3\n",
       "6  16  60"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dropna(axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5</td>\n",
       "      <td>52</td>\n",
       "      <td>66.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>31.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>81</td>\n",
       "      <td>70</td>\n",
       "      <td>17.0</td>\n",
       "      <td>64.0</td>\n",
       "      <td>49.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>71</td>\n",
       "      <td>59</td>\n",
       "      <td>90.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>33.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>22</td>\n",
       "      <td>98</td>\n",
       "      <td>19.0</td>\n",
       "      <td>50.0</td>\n",
       "      <td>51.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>73</td>\n",
       "      <td>29</td>\n",
       "      <td>NaN</td>\n",
       "      <td>78.0</td>\n",
       "      <td>34.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>93</td>\n",
       "      <td>3</td>\n",
       "      <td>24.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>16</td>\n",
       "      <td>60</td>\n",
       "      <td>95.0</td>\n",
       "      <td>26.0</td>\n",
       "      <td>25.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    0   1     2     3     4\n",
       "0   5  52  66.0   0.0  31.0\n",
       "1  81  70  17.0  64.0  49.0\n",
       "2  71  59  90.0   NaN  33.0\n",
       "3  22  98  19.0  50.0  51.0\n",
       "4  73  29   NaN  78.0  34.0\n",
       "5  93   3  24.0  12.0   NaN\n",
       "6  16  60  95.0  26.0  25.0"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 对缺失值进行覆盖\n",
    "    - fillna"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5</td>\n",
       "      <td>52</td>\n",
       "      <td>66.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>31.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>81</td>\n",
       "      <td>70</td>\n",
       "      <td>17.0</td>\n",
       "      <td>64.0</td>\n",
       "      <td>49.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>71</td>\n",
       "      <td>59</td>\n",
       "      <td>90.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>33.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>22</td>\n",
       "      <td>98</td>\n",
       "      <td>19.0</td>\n",
       "      <td>50.0</td>\n",
       "      <td>51.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>73</td>\n",
       "      <td>29</td>\n",
       "      <td>6.0</td>\n",
       "      <td>78.0</td>\n",
       "      <td>34.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>93</td>\n",
       "      <td>3</td>\n",
       "      <td>24.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>6.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>16</td>\n",
       "      <td>60</td>\n",
       "      <td>95.0</td>\n",
       "      <td>26.0</td>\n",
       "      <td>25.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    0   1     2     3     4\n",
       "0   5  52  66.0   0.0  31.0\n",
       "1  81  70  17.0  64.0  49.0\n",
       "2  71  59  90.0   6.0  33.0\n",
       "3  22  98  19.0  50.0  51.0\n",
       "4  73  29   6.0  78.0  34.0\n",
       "5  93   3  24.0  12.0   6.0\n",
       "6  16  60  95.0  26.0  25.0"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import random\n",
    "df.fillna(value=0) #使用指定值将源数据中所有的空值进行填充\n",
    "\n",
    "df.fillna(value = random.randint(0,10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\20761\\AppData\\Local\\Temp\\ipykernel_13148\\3033243836.py:3: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.\n",
      "  df.fillna(axis=0,method='bfill')\n",
      "C:\\Users\\20761\\AppData\\Local\\Temp\\ipykernel_13148\\3033243836.py:4: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.\n",
      "  df.fillna(axis=0,method='ffill')\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5</td>\n",
       "      <td>52</td>\n",
       "      <td>66.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>31.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>81</td>\n",
       "      <td>70</td>\n",
       "      <td>17.0</td>\n",
       "      <td>64.0</td>\n",
       "      <td>49.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>71</td>\n",
       "      <td>59</td>\n",
       "      <td>90.0</td>\n",
       "      <td>64.0</td>\n",
       "      <td>33.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>22</td>\n",
       "      <td>98</td>\n",
       "      <td>19.0</td>\n",
       "      <td>50.0</td>\n",
       "      <td>51.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>73</td>\n",
       "      <td>29</td>\n",
       "      <td>19.0</td>\n",
       "      <td>78.0</td>\n",
       "      <td>34.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>93</td>\n",
       "      <td>3</td>\n",
       "      <td>24.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>34.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>16</td>\n",
       "      <td>60</td>\n",
       "      <td>95.0</td>\n",
       "      <td>26.0</td>\n",
       "      <td>25.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    0   1     2     3     4\n",
       "0   5  52  66.0   0.0  31.0\n",
       "1  81  70  17.0  64.0  49.0\n",
       "2  71  59  90.0  64.0  33.0\n",
       "3  22  98  19.0  50.0  51.0\n",
       "4  73  29  19.0  78.0  34.0\n",
       "5  93   3  24.0  12.0  34.0\n",
       "6  16  60  95.0  26.0  25.0"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#使用空的近邻值进行填充\n",
    "#method=ffill向前填充，bfill向后填充\n",
    "df.fillna(axis=0,method='bfill')\n",
    "df.fillna(axis=0,method='ffill')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 什么时候用dropna什么时候用fillna\n",
    "    - 尽量使用dropna，如果删除成本比较高，则使用fillna"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 使用空值对应列的均值进行空值填充"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "57.666666666666664"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "for col in df.columns:#检查每一列\n",
    "    #检测哪些列中存有空值\n",
    "    if df[col].isnull().sum() > 0:#说明df[col]中存有空值\n",
    "        mean_value = df[col].mean() #某段计算得来的科学值\n",
    "        df[col] = df[col].fillna(value=np.around(mean_value))\n",
    "        # df[col] = df[col].fillna(value=random.randint(0,10))\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>60</td>\n",
       "      <td>20</td>\n",
       "      <td>95.0</td>\n",
       "      <td>19.0</td>\n",
       "      <td>66.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>92</td>\n",
       "      <td>27</td>\n",
       "      <td>66.0</td>\n",
       "      <td>7.0</td>\n",
       "      <td>94.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>91</td>\n",
       "      <td>10</td>\n",
       "      <td>12.0</td>\n",
       "      <td>47.0</td>\n",
       "      <td>29.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>61</td>\n",
       "      <td>7</td>\n",
       "      <td>51.0</td>\n",
       "      <td>81.0</td>\n",
       "      <td>87.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>96</td>\n",
       "      <td>67</td>\n",
       "      <td>48.0</td>\n",
       "      <td>96.0</td>\n",
       "      <td>70.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>92</td>\n",
       "      <td>21</td>\n",
       "      <td>61.0</td>\n",
       "      <td>58.0</td>\n",
       "      <td>58.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>80</td>\n",
       "      <td>48</td>\n",
       "      <td>1.0</td>\n",
       "      <td>19.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    0   1     2     3     4\n",
       "0  60  20  95.0  19.0  66.0\n",
       "1  92  27  66.0   7.0  94.0\n",
       "2  91  10  12.0  47.0  29.0\n",
       "3  61   7  51.0  81.0  87.0\n",
       "4  96  67  48.0  96.0  70.0\n",
       "5  92  21  61.0  58.0  58.0\n",
       "6  80  48   1.0  19.0   0.0"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 面试题\n",
    "- 数据说明： \n",
    "    - 数据是1个冷库的温度数据，1-7对应7个温度采集设备，1分钟采集一次。\n",
    "\n",
    "- 数据处理目标：\n",
    "    - 用1-4对应的4个必须设备，通过建立冷库的温度场关系模型，预估出5-7对应的数据。\n",
    "    - 最后每个冷库中仅需放置4个设备，取代放置7个设备。\n",
    "    - f(1-4) --> y(5-7)\n",
    "\n",
    "- 数据处理过程：\n",
    "    - 1、原始数据中有丢帧现象，需要做预处理；\n",
    "    - 2、matplotlib 绘图；\n",
    "    - 3、建立逻辑回归模型。\n",
    "\n",
    "- 无标准答案，按个人理解操作即可，请把自己的操作过程以文字形式简单描述一下，谢谢配合。\n",
    "\n",
    "- 测试数据为testData.xlsx\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>购药时间</th>\n",
       "      <th>社保卡号</th>\n",
       "      <th>商品编码</th>\n",
       "      <th>商品名称</th>\n",
       "      <th>销售数量</th>\n",
       "      <th>应收金额</th>\n",
       "      <th>实收金额</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2018-01-01 星期五</td>\n",
       "      <td>1.616528e+06</td>\n",
       "      <td>236701.0</td>\n",
       "      <td>强力VC银翘片</td>\n",
       "      <td>6.0</td>\n",
       "      <td>82.8</td>\n",
       "      <td>69.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2018-01-01 星期五</td>\n",
       "      <td>1.014705e+08</td>\n",
       "      <td>236709.0</td>\n",
       "      <td>心痛定</td>\n",
       "      <td>4.0</td>\n",
       "      <td>179.2</td>\n",
       "      <td>159.20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2018-01-01 星期五</td>\n",
       "      <td>1.007261e+10</td>\n",
       "      <td>2367011.0</td>\n",
       "      <td>开博通</td>\n",
       "      <td>1.0</td>\n",
       "      <td>28.0</td>\n",
       "      <td>25.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2018-01-01 星期五</td>\n",
       "      <td>1.007460e+10</td>\n",
       "      <td>2367011.0</td>\n",
       "      <td>开博通</td>\n",
       "      <td>5.0</td>\n",
       "      <td>140.0</td>\n",
       "      <td>125.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2018-01-01 星期五</td>\n",
       "      <td>1.174343e+07</td>\n",
       "      <td>861405.0</td>\n",
       "      <td>苯磺酸氨氯地平片(络活喜)</td>\n",
       "      <td>1.0</td>\n",
       "      <td>34.5</td>\n",
       "      <td>31.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6573</th>\n",
       "      <td>NaN</td>\n",
       "      <td>1.177863e+07</td>\n",
       "      <td>2367011.0</td>\n",
       "      <td>高特灵</td>\n",
       "      <td>10.0</td>\n",
       "      <td>56.0</td>\n",
       "      <td>56.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6574</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6575</th>\n",
       "      <td>2018-04-27 星期三</td>\n",
       "      <td>1.008787e+10</td>\n",
       "      <td>2367011.0</td>\n",
       "      <td>高特灵</td>\n",
       "      <td>2.0</td>\n",
       "      <td>11.2</td>\n",
       "      <td>9.86</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6576</th>\n",
       "      <td>2018-04-27 星期三</td>\n",
       "      <td>1.340663e+07</td>\n",
       "      <td>2367011.0</td>\n",
       "      <td>高特灵</td>\n",
       "      <td>1.0</td>\n",
       "      <td>5.6</td>\n",
       "      <td>5.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6577</th>\n",
       "      <td>2018-04-28 星期四</td>\n",
       "      <td>1.192693e+07</td>\n",
       "      <td>2367011.0</td>\n",
       "      <td>高特灵</td>\n",
       "      <td>2.0</td>\n",
       "      <td>11.2</td>\n",
       "      <td>10.00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6578 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                购药时间          社保卡号       商品编码           商品名称  销售数量   应收金额  \\\n",
       "0     2018-01-01 星期五  1.616528e+06   236701.0        强力VC银翘片   6.0   82.8   \n",
       "1     2018-01-01 星期五  1.014705e+08   236709.0            心痛定   4.0  179.2   \n",
       "2     2018-01-01 星期五  1.007261e+10  2367011.0            开博通   1.0   28.0   \n",
       "3     2018-01-01 星期五  1.007460e+10  2367011.0            开博通   5.0  140.0   \n",
       "4     2018-01-01 星期五  1.174343e+07   861405.0  苯磺酸氨氯地平片(络活喜)   1.0   34.5   \n",
       "...              ...           ...        ...            ...   ...    ...   \n",
       "6573             NaN  1.177863e+07  2367011.0            高特灵  10.0   56.0   \n",
       "6574             NaN           NaN        NaN            NaN   NaN    NaN   \n",
       "6575  2018-04-27 星期三  1.008787e+10  2367011.0            高特灵   2.0   11.2   \n",
       "6576  2018-04-27 星期三  1.340663e+07  2367011.0            高特灵   1.0    5.6   \n",
       "6577  2018-04-28 星期四  1.192693e+07  2367011.0            高特灵   2.0   11.2   \n",
       "\n",
       "        实收金额  \n",
       "0      69.00  \n",
       "1     159.20  \n",
       "2      25.00  \n",
       "3     125.00  \n",
       "4      31.00  \n",
       "...      ...  \n",
       "6573   56.00  \n",
       "6574     NaN  \n",
       "6575    9.86  \n",
       "6576    5.00  \n",
       "6577   10.00  \n",
       "\n",
       "[6578 rows x 7 columns]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pd.read_excel('../resource/朝阳医院2018年销售数据.xlsx') #读取一个excel/csv\n",
    "\n",
    "data.index\n",
    "\n",
    "pd.read_csv('../resource/test.CSV',encoding='GBK')\n",
    "\n",
    "# data = pd.read_excel('../data/testData.xlsx').drop(labels=['none','none1'],axis=1)\n",
    "# data\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(6578, 7)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(6575, 7)"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#删除空对应的行数据\n",
    "data.dropna(axis=0).shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\20761\\AppData\\Local\\Temp\\ipykernel_1708\\240439370.py:2: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.\n",
      "  data.fillna(method='ffill',axis=0).fillna(method='bfill',axis=0)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>购药时间</th>\n",
       "      <th>社保卡号</th>\n",
       "      <th>商品编码</th>\n",
       "      <th>商品名称</th>\n",
       "      <th>销售数量</th>\n",
       "      <th>应收金额</th>\n",
       "      <th>实收金额</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2018-01-01 星期五</td>\n",
       "      <td>1.616528e+06</td>\n",
       "      <td>236701.0</td>\n",
       "      <td>强力VC银翘片</td>\n",
       "      <td>6.0</td>\n",
       "      <td>82.8</td>\n",
       "      <td>69.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2018-01-01 星期五</td>\n",
       "      <td>1.014705e+08</td>\n",
       "      <td>236709.0</td>\n",
       "      <td>心痛定</td>\n",
       "      <td>4.0</td>\n",
       "      <td>179.2</td>\n",
       "      <td>159.20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2018-01-01 星期五</td>\n",
       "      <td>1.007261e+10</td>\n",
       "      <td>2367011.0</td>\n",
       "      <td>开博通</td>\n",
       "      <td>1.0</td>\n",
       "      <td>28.0</td>\n",
       "      <td>25.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2018-01-01 星期五</td>\n",
       "      <td>1.007460e+10</td>\n",
       "      <td>2367011.0</td>\n",
       "      <td>开博通</td>\n",
       "      <td>5.0</td>\n",
       "      <td>140.0</td>\n",
       "      <td>125.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2018-01-01 星期五</td>\n",
       "      <td>1.174343e+07</td>\n",
       "      <td>861405.0</td>\n",
       "      <td>苯磺酸氨氯地平片(络活喜)</td>\n",
       "      <td>1.0</td>\n",
       "      <td>34.5</td>\n",
       "      <td>31.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6573</th>\n",
       "      <td>2018-07-19 星期二</td>\n",
       "      <td>1.177863e+07</td>\n",
       "      <td>2367011.0</td>\n",
       "      <td>高特灵</td>\n",
       "      <td>10.0</td>\n",
       "      <td>56.0</td>\n",
       "      <td>56.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6574</th>\n",
       "      <td>2018-07-19 星期二</td>\n",
       "      <td>1.177863e+07</td>\n",
       "      <td>2367011.0</td>\n",
       "      <td>高特灵</td>\n",
       "      <td>10.0</td>\n",
       "      <td>56.0</td>\n",
       "      <td>56.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6575</th>\n",
       "      <td>2018-04-27 星期三</td>\n",
       "      <td>1.008787e+10</td>\n",
       "      <td>2367011.0</td>\n",
       "      <td>高特灵</td>\n",
       "      <td>2.0</td>\n",
       "      <td>11.2</td>\n",
       "      <td>9.86</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6576</th>\n",
       "      <td>2018-04-27 星期三</td>\n",
       "      <td>1.340663e+07</td>\n",
       "      <td>2367011.0</td>\n",
       "      <td>高特灵</td>\n",
       "      <td>1.0</td>\n",
       "      <td>5.6</td>\n",
       "      <td>5.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6577</th>\n",
       "      <td>2018-04-28 星期四</td>\n",
       "      <td>1.192693e+07</td>\n",
       "      <td>2367011.0</td>\n",
       "      <td>高特灵</td>\n",
       "      <td>2.0</td>\n",
       "      <td>11.2</td>\n",
       "      <td>10.00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6578 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                购药时间          社保卡号       商品编码           商品名称  销售数量   应收金额  \\\n",
       "0     2018-01-01 星期五  1.616528e+06   236701.0        强力VC银翘片   6.0   82.8   \n",
       "1     2018-01-01 星期五  1.014705e+08   236709.0            心痛定   4.0  179.2   \n",
       "2     2018-01-01 星期五  1.007261e+10  2367011.0            开博通   1.0   28.0   \n",
       "3     2018-01-01 星期五  1.007460e+10  2367011.0            开博通   5.0  140.0   \n",
       "4     2018-01-01 星期五  1.174343e+07   861405.0  苯磺酸氨氯地平片(络活喜)   1.0   34.5   \n",
       "...              ...           ...        ...            ...   ...    ...   \n",
       "6573  2018-07-19 星期二  1.177863e+07  2367011.0            高特灵  10.0   56.0   \n",
       "6574  2018-07-19 星期二  1.177863e+07  2367011.0            高特灵  10.0   56.0   \n",
       "6575  2018-04-27 星期三  1.008787e+10  2367011.0            高特灵   2.0   11.2   \n",
       "6576  2018-04-27 星期三  1.340663e+07  2367011.0            高特灵   1.0    5.6   \n",
       "6577  2018-04-28 星期四  1.192693e+07  2367011.0            高特灵   2.0   11.2   \n",
       "\n",
       "        实收金额  \n",
       "0      69.00  \n",
       "1     159.20  \n",
       "2      25.00  \n",
       "3     125.00  \n",
       "4      31.00  \n",
       "...      ...  \n",
       "6573   56.00  \n",
       "6574   56.00  \n",
       "6575    9.86  \n",
       "6576    5.00  \n",
       "6577   10.00  \n",
       "\n",
       "[6578 rows x 7 columns]"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#填充\n",
    "data.fillna(method='ffill',axis=0).fillna(method='bfill',axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>购药时间</th>\n",
       "      <th>社保卡号</th>\n",
       "      <th>商品编码</th>\n",
       "      <th>商品名称</th>\n",
       "      <th>销售数量</th>\n",
       "      <th>应收金额</th>\n",
       "      <th>实收金额</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2018-01-01 星期五</td>\n",
       "      <td>1.616528e+06</td>\n",
       "      <td>236701.0</td>\n",
       "      <td>强力VC银翘片</td>\n",
       "      <td>6.0</td>\n",
       "      <td>82.8</td>\n",
       "      <td>69.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2018-01-01 星期五</td>\n",
       "      <td>1.014705e+08</td>\n",
       "      <td>236709.0</td>\n",
       "      <td>心痛定</td>\n",
       "      <td>4.0</td>\n",
       "      <td>179.2</td>\n",
       "      <td>159.20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2018-01-01 星期五</td>\n",
       "      <td>1.007261e+10</td>\n",
       "      <td>2367011.0</td>\n",
       "      <td>开博通</td>\n",
       "      <td>1.0</td>\n",
       "      <td>28.0</td>\n",
       "      <td>25.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2018-01-01 星期五</td>\n",
       "      <td>1.007460e+10</td>\n",
       "      <td>2367011.0</td>\n",
       "      <td>开博通</td>\n",
       "      <td>5.0</td>\n",
       "      <td>140.0</td>\n",
       "      <td>125.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2018-01-01 星期五</td>\n",
       "      <td>1.174343e+07</td>\n",
       "      <td>861405.0</td>\n",
       "      <td>苯磺酸氨氯地平片(络活喜)</td>\n",
       "      <td>1.0</td>\n",
       "      <td>34.5</td>\n",
       "      <td>31.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6573</th>\n",
       "      <td>NaN</td>\n",
       "      <td>1.177863e+07</td>\n",
       "      <td>2367011.0</td>\n",
       "      <td>高特灵</td>\n",
       "      <td>10.0</td>\n",
       "      <td>56.0</td>\n",
       "      <td>56.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6574</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6575</th>\n",
       "      <td>2018-04-27 星期三</td>\n",
       "      <td>1.008787e+10</td>\n",
       "      <td>2367011.0</td>\n",
       "      <td>高特灵</td>\n",
       "      <td>2.0</td>\n",
       "      <td>11.2</td>\n",
       "      <td>9.86</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6576</th>\n",
       "      <td>2018-04-27 星期三</td>\n",
       "      <td>1.340663e+07</td>\n",
       "      <td>2367011.0</td>\n",
       "      <td>高特灵</td>\n",
       "      <td>1.0</td>\n",
       "      <td>5.6</td>\n",
       "      <td>5.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6577</th>\n",
       "      <td>2018-04-28 星期四</td>\n",
       "      <td>1.192693e+07</td>\n",
       "      <td>2367011.0</td>\n",
       "      <td>高特灵</td>\n",
       "      <td>2.0</td>\n",
       "      <td>11.2</td>\n",
       "      <td>10.00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6578 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                购药时间          社保卡号       商品编码           商品名称  销售数量   应收金额  \\\n",
       "0     2018-01-01 星期五  1.616528e+06   236701.0        强力VC银翘片   6.0   82.8   \n",
       "1     2018-01-01 星期五  1.014705e+08   236709.0            心痛定   4.0  179.2   \n",
       "2     2018-01-01 星期五  1.007261e+10  2367011.0            开博通   1.0   28.0   \n",
       "3     2018-01-01 星期五  1.007460e+10  2367011.0            开博通   5.0  140.0   \n",
       "4     2018-01-01 星期五  1.174343e+07   861405.0  苯磺酸氨氯地平片(络活喜)   1.0   34.5   \n",
       "...              ...           ...        ...            ...   ...    ...   \n",
       "6573             NaN  1.177863e+07  2367011.0            高特灵  10.0   56.0   \n",
       "6574             NaN           NaN        NaN            NaN   NaN    NaN   \n",
       "6575  2018-04-27 星期三  1.008787e+10  2367011.0            高特灵   2.0   11.2   \n",
       "6576  2018-04-27 星期三  1.340663e+07  2367011.0            高特灵   1.0    5.6   \n",
       "6577  2018-04-28 星期四  1.192693e+07  2367011.0            高特灵   2.0   11.2   \n",
       "\n",
       "        实收金额  \n",
       "0      69.00  \n",
       "1     159.20  \n",
       "2      25.00  \n",
       "3     125.00  \n",
       "4      31.00  \n",
       "...      ...  \n",
       "6573   56.00  \n",
       "6574     NaN  \n",
       "6575    9.86  \n",
       "6576    5.00  \n",
       "6577   10.00  \n",
       "\n",
       "[6578 rows x 7 columns]"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.drop_duplicates(keep='first')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "ename": "FileNotFoundError",
     "evalue": "[Errno 2] No such file or directory: '../data/testData.xlsx'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[27], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m data \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_excel(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m../data/testData.xlsx\u001b[39m\u001b[38;5;124m'\u001b[39m)\u001b[38;5;241m.\u001b[39mdrop(labels\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnone\u001b[39m\u001b[38;5;124m'\u001b[39m,\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnone1\u001b[39m\u001b[38;5;124m'\u001b[39m],axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[0;32m      2\u001b[0m data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtime\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mto_datetime(data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtime\u001b[39m\u001b[38;5;124m'\u001b[39m])\n",
      "File \u001b[1;32md:\\Anaconda3\\Lib\\site-packages\\pandas\\io\\excel\\_base.py:495\u001b[0m, in \u001b[0;36mread_excel\u001b[1;34m(io, sheet_name, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, date_format, thousands, decimal, comment, skipfooter, storage_options, dtype_backend, engine_kwargs)\u001b[0m\n\u001b[0;32m    493\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(io, ExcelFile):\n\u001b[0;32m    494\u001b[0m     should_close \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m--> 495\u001b[0m     io \u001b[38;5;241m=\u001b[39m ExcelFile(\n\u001b[0;32m    496\u001b[0m         io,\n\u001b[0;32m    497\u001b[0m         storage_options\u001b[38;5;241m=\u001b[39mstorage_options,\n\u001b[0;32m    498\u001b[0m         engine\u001b[38;5;241m=\u001b[39mengine,\n\u001b[0;32m    499\u001b[0m         engine_kwargs\u001b[38;5;241m=\u001b[39mengine_kwargs,\n\u001b[0;32m    500\u001b[0m     )\n\u001b[0;32m    501\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m engine \u001b[38;5;129;01mand\u001b[39;00m engine \u001b[38;5;241m!=\u001b[39m io\u001b[38;5;241m.\u001b[39mengine:\n\u001b[0;32m    502\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m    503\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEngine should not be specified when passing \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m    504\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124man ExcelFile - ExcelFile already has the engine set\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m    505\u001b[0m     )\n",
      "File \u001b[1;32md:\\Anaconda3\\Lib\\site-packages\\pandas\\io\\excel\\_base.py:1550\u001b[0m, in \u001b[0;36mExcelFile.__init__\u001b[1;34m(self, path_or_buffer, engine, storage_options, engine_kwargs)\u001b[0m\n\u001b[0;32m   1548\u001b[0m     ext \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mxls\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   1549\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1550\u001b[0m     ext \u001b[38;5;241m=\u001b[39m inspect_excel_format(\n\u001b[0;32m   1551\u001b[0m         content_or_path\u001b[38;5;241m=\u001b[39mpath_or_buffer, storage_options\u001b[38;5;241m=\u001b[39mstorage_options\n\u001b[0;32m   1552\u001b[0m     )\n\u001b[0;32m   1553\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m ext \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m   1554\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m   1555\u001b[0m             \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExcel file format cannot be determined, you must specify \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   1556\u001b[0m             \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124man engine manually.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   1557\u001b[0m         )\n",
      "File \u001b[1;32md:\\Anaconda3\\Lib\\site-packages\\pandas\\io\\excel\\_base.py:1402\u001b[0m, in \u001b[0;36minspect_excel_format\u001b[1;34m(content_or_path, storage_options)\u001b[0m\n\u001b[0;32m   1399\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(content_or_path, \u001b[38;5;28mbytes\u001b[39m):\n\u001b[0;32m   1400\u001b[0m     content_or_path \u001b[38;5;241m=\u001b[39m BytesIO(content_or_path)\n\u001b[1;32m-> 1402\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m get_handle(\n\u001b[0;32m   1403\u001b[0m     content_or_path, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrb\u001b[39m\u001b[38;5;124m\"\u001b[39m, storage_options\u001b[38;5;241m=\u001b[39mstorage_options, is_text\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[0;32m   1404\u001b[0m ) \u001b[38;5;28;01mas\u001b[39;00m handle:\n\u001b[0;32m   1405\u001b[0m     stream \u001b[38;5;241m=\u001b[39m handle\u001b[38;5;241m.\u001b[39mhandle\n\u001b[0;32m   1406\u001b[0m     stream\u001b[38;5;241m.\u001b[39mseek(\u001b[38;5;241m0\u001b[39m)\n",
      "File \u001b[1;32md:\\Anaconda3\\Lib\\site-packages\\pandas\\io\\common.py:882\u001b[0m, in \u001b[0;36mget_handle\u001b[1;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[0;32m    873\u001b[0m         handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\n\u001b[0;32m    874\u001b[0m             handle,\n\u001b[0;32m    875\u001b[0m             ioargs\u001b[38;5;241m.\u001b[39mmode,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    878\u001b[0m             newline\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m    879\u001b[0m         )\n\u001b[0;32m    880\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m    881\u001b[0m         \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m--> 882\u001b[0m         handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n\u001b[0;32m    883\u001b[0m     handles\u001b[38;5;241m.\u001b[39mappend(handle)\n\u001b[0;32m    885\u001b[0m \u001b[38;5;66;03m# Convert BytesIO or file objects passed with an encoding\u001b[39;00m\n",
      "\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../data/testData.xlsx'"
     ]
    }
   ],
   "source": [
    "data = pd.read_excel('../data/testData.xlsx').drop(labels=['none','none1'],axis=1)\n",
    "data['time'] = pd.to_datetime(data['time'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "\"None of ['time'] are in the columns\"",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_1708\\692308226.py\u001b[0m in \u001b[0;36m?\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mdata\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mset_index\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'time'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0minplace\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      2\u001b[0m \u001b[0mdata1\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'ffill'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'bfill'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      3\u001b[0m \u001b[0mdata1\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32md:\\Anaconda3\\Lib\\site-packages\\pandas\\core\\frame.py\u001b[0m in \u001b[0;36m?\u001b[1;34m(self, keys, drop, append, inplace, verify_integrity)\u001b[0m\n\u001b[0;32m   6118\u001b[0m                     \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mfound\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   6119\u001b[0m                         \u001b[0mmissing\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcol\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   6120\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   6121\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mmissing\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 6122\u001b[1;33m             \u001b[1;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33mf\"\u001b[0m\u001b[1;33mNone of \u001b[0m\u001b[1;33m{\u001b[0m\u001b[0mmissing\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m are in the columns\u001b[0m\u001b[1;33m\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   6123\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   6124\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0minplace\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   6125\u001b[0m             \u001b[0mframe\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mKeyError\u001b[0m: \"None of ['time'] are in the columns\""
     ]
    }
   ],
   "source": [
    "data.set_index('time',inplace=True)\n",
    "data1 = data.fillna(method='ffill',axis=0).fillna(method='bfill',axis=0)\n",
    "data1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 处理重复数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>15</td>\n",
       "      <td>17</td>\n",
       "      <td>41</td>\n",
       "      <td>35</td>\n",
       "      <td>73</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>45</td>\n",
       "      <td>88</td>\n",
       "      <td>92</td>\n",
       "      <td>99</td>\n",
       "      <td>36</td>\n",
       "      <td>57</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>40</td>\n",
       "      <td>40</td>\n",
       "      <td>27</td>\n",
       "      <td>64</td>\n",
       "      <td>22</td>\n",
       "      <td>98</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>10</td>\n",
       "      <td>48</td>\n",
       "      <td>51</td>\n",
       "      <td>8</td>\n",
       "      <td>30</td>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>80</td>\n",
       "      <td>56</td>\n",
       "      <td>33</td>\n",
       "      <td>81</td>\n",
       "      <td>40</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    0   1   2   3   4   5\n",
       "0  15  17  41  35  73  20\n",
       "1   1   1   1   1   1   1\n",
       "2  45  88  92  99  36  57\n",
       "3   1   1   1   1   1   1\n",
       "4  40  40  27  64  22  98\n",
       "5   1   1   1   1   1   1\n",
       "6  10  48  51   8  30  30\n",
       "7  80  56  33  81  40   8"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = DataFrame(data=np.random.randint(0,100,size=(8,6)))\n",
    "df.iloc[1] = [1,1,1,1,1,1]\n",
    "df.iloc[3] = [1,1,1,1,1,1]\n",
    "df.iloc[5] = [1,1,1,1,1,1]\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    False\n",
       "1    False\n",
       "2    False\n",
       "3     True\n",
       "4    False\n",
       "5     True\n",
       "6    False\n",
       "7    False\n",
       "dtype: bool"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#检测哪些行存有重复的数据\n",
    "df.duplicated(keep='first')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>15</td>\n",
       "      <td>17</td>\n",
       "      <td>41</td>\n",
       "      <td>35</td>\n",
       "      <td>73</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>45</td>\n",
       "      <td>88</td>\n",
       "      <td>92</td>\n",
       "      <td>99</td>\n",
       "      <td>36</td>\n",
       "      <td>57</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>40</td>\n",
       "      <td>40</td>\n",
       "      <td>27</td>\n",
       "      <td>64</td>\n",
       "      <td>22</td>\n",
       "      <td>98</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>10</td>\n",
       "      <td>48</td>\n",
       "      <td>51</td>\n",
       "      <td>8</td>\n",
       "      <td>30</td>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>80</td>\n",
       "      <td>56</td>\n",
       "      <td>33</td>\n",
       "      <td>81</td>\n",
       "      <td>40</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    0   1   2   3   4   5\n",
       "0  15  17  41  35  73  20\n",
       "1   1   1   1   1   1   1\n",
       "2  45  88  92  99  36  57\n",
       "4  40  40  27  64  22  98\n",
       "6  10  48  51   8  30  30\n",
       "7  80  56  33  81  40   8"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.loc[~df.duplicated(keep='first')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>15</td>\n",
       "      <td>17</td>\n",
       "      <td>41</td>\n",
       "      <td>35</td>\n",
       "      <td>73</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>45</td>\n",
       "      <td>88</td>\n",
       "      <td>92</td>\n",
       "      <td>99</td>\n",
       "      <td>36</td>\n",
       "      <td>57</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>40</td>\n",
       "      <td>40</td>\n",
       "      <td>27</td>\n",
       "      <td>64</td>\n",
       "      <td>22</td>\n",
       "      <td>98</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>10</td>\n",
       "      <td>48</td>\n",
       "      <td>51</td>\n",
       "      <td>8</td>\n",
       "      <td>30</td>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>80</td>\n",
       "      <td>56</td>\n",
       "      <td>33</td>\n",
       "      <td>81</td>\n",
       "      <td>40</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    0   1   2   3   4   5\n",
       "0  15  17  41  35  73  20\n",
       "1   1   1   1   1   1   1\n",
       "2  45  88  92  99  36  57\n",
       "4  40  40  27  64  22  98\n",
       "6  10  48  51   8  30  30\n",
       "7  80  56  33  81  40   8"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#异步到位删除\n",
    "df.drop_duplicates(keep='first')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 处理异常数据\n",
    "- 自定义一个1000行3列（A，B，C）取值范围为0-1的数据源，然后将C列中的值大于其两倍标准差的异常值进行清洗"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.808996</td>\n",
       "      <td>0.061100</td>\n",
       "      <td>0.719706</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.412267</td>\n",
       "      <td>0.215300</td>\n",
       "      <td>0.877308</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.225386</td>\n",
       "      <td>0.009403</td>\n",
       "      <td>0.190674</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.621006</td>\n",
       "      <td>0.470679</td>\n",
       "      <td>0.101402</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.285664</td>\n",
       "      <td>0.095644</td>\n",
       "      <td>0.754686</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          A         B         C\n",
       "0  0.808996  0.061100  0.719706\n",
       "1  0.412267  0.215300  0.877308\n",
       "2  0.225386  0.009403  0.190674\n",
       "3  0.621006  0.470679  0.101402\n",
       "4  0.285664  0.095644  0.754686"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = DataFrame(data=np.random.random(size=(1000,3)),columns=['A','B','C'])\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.574726831005339"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#制定判定异常值的条件\n",
    "twice_std = df['C'].std() * 2\n",
    "twice_std"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       True\n",
       "1       True\n",
       "2      False\n",
       "3      False\n",
       "4       True\n",
       "       ...  \n",
       "995     True\n",
       "996    False\n",
       "997     True\n",
       "998    False\n",
       "999     True\n",
       "Name: C, Length: 1000, dtype: bool"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['C'] > twice_std"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.225386</td>\n",
       "      <td>0.009403</td>\n",
       "      <td>0.190674</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.621006</td>\n",
       "      <td>0.470679</td>\n",
       "      <td>0.101402</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0.534697</td>\n",
       "      <td>0.378318</td>\n",
       "      <td>0.236785</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>0.608414</td>\n",
       "      <td>0.314417</td>\n",
       "      <td>0.362445</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>0.232701</td>\n",
       "      <td>0.238167</td>\n",
       "      <td>0.361937</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>984</th>\n",
       "      <td>0.126935</td>\n",
       "      <td>0.395630</td>\n",
       "      <td>0.276133</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>986</th>\n",
       "      <td>0.745662</td>\n",
       "      <td>0.770987</td>\n",
       "      <td>0.342238</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>989</th>\n",
       "      <td>0.875004</td>\n",
       "      <td>0.219291</td>\n",
       "      <td>0.263957</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>996</th>\n",
       "      <td>0.015172</td>\n",
       "      <td>0.329078</td>\n",
       "      <td>0.357025</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>998</th>\n",
       "      <td>0.525968</td>\n",
       "      <td>0.720884</td>\n",
       "      <td>0.153907</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>567 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            A         B         C\n",
       "2    0.225386  0.009403  0.190674\n",
       "3    0.621006  0.470679  0.101402\n",
       "6    0.534697  0.378318  0.236785\n",
       "9    0.608414  0.314417  0.362445\n",
       "12   0.232701  0.238167  0.361937\n",
       "..        ...       ...       ...\n",
       "984  0.126935  0.395630  0.276133\n",
       "986  0.745662  0.770987  0.342238\n",
       "989  0.875004  0.219291  0.263957\n",
       "996  0.015172  0.329078  0.357025\n",
       "998  0.525968  0.720884  0.153907\n",
       "\n",
       "[567 rows x 3 columns]"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1 = df.loc[~(df['C'] > twice_std)]\n",
    "df1"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  },
  "vscode": {
   "interpreter": {
    "hash": "d5d375f2d234d91eb9c1d04036871c68a0c11e8c2abf8991580b257f2c707712"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
